*** How worried should we be? The implications of fabricated survey data for political science
*** Figure A4. Similarity among Fraudulent and All Clean Interviews
*** Requires the user-written percentmatch and cem packages (both are called below), e.g. ssc install percentmatch / ssc install cem

* Turn off output paging so the script runs without pausing
set more off

* Edit the path on the next line so it points at the folder containing the dataset
cd "C:\~\Downloads\"

* Discard whatever is in memory, then load the analysis dataset
clear
use "VEN_fraud_percentmatch.dta"

** clean up
* An interview only stays flagged as likely fraud when its status is "Canceled"
replace likelyfraud = 0 if status != "Canceled" & likelyfraud == 1

** birth year (input for age in years)
* Take q2y when consent1 == 1; otherwise q2y3 when consent2 == 1; otherwise missing
gen birth_yr = cond(consent1 == 1, q2y, cond(consent2 == 1, q2y3, .))
* 888888 is a sentinel code, not a real year: convert it to missing
mvdecode birth_yr, mv(888888)

** interview date and upload timestamp
* Daily date parsed from the string variable `date`; daily dates are small
* integers, so the default storage type holds them exactly.
gen svy_date = date(date, "MDYhm")
format svy_date %td
gen year = year(svy_date)

* Clock() yields a datetime in milliseconds since 01jan1960. Values of that
* magnitude MUST be stored as double: the default float type rounds them to
* roughly two-minute precision, which can also shift the extracted hour for
* uploads close to an hour boundary.
gen double upload_time = Clock(upload, "MDYhm")
format upload_time %tC
gen upload_hour = hhC(upload_time)

** age in years and age cohort
* Age is the interview year minus the birth year; it is missing whenever
* either component is missing.
gen age_yrs = year - birth_yr

* Cohort codes 1-6. Respondents younger than 18 or with unknown age stay missing.
gen edad = 1 if age_yrs >= 18 & age_yrs <= 25
replace edad = 2 if age_yrs >= 26 & age_yrs <= 35
replace edad = 3 if age_yrs >= 36 & age_yrs <= 45
replace edad = 4 if age_yrs >= 46 & age_yrs <= 55
replace edad = 5 if age_yrs >= 56 & age_yrs <= 65
* Stata orders missing values above every number, so a bare "age_yrs >= 66"
* would also capture respondents with unknown age and file them under "66+".
* The !missing() guard keeps them missing instead.
* NOTE(review): this changes edad for observations with missing age_yrs, which
* feeds the edad-based cem call further down - confirm downstream results.
replace edad = 6 if age_yrs >= 66 & !missing(age_yrs)
label var edad "Age Cohort"
label define agegroup 1 "18-25 years old" 2 "26-35 years old" 3 "36-45 years old" 4 "46-55 years old" 5 "56-65 years old" 6 "66+ years old"
label values edad agegroup

* education level: three tiers built from years of education (ed)
* Values of ed outside 0-18 (including the sentinel codes) leave ed_level missing
gen ed_level = cond(inlist(ed, 0, 1, 2, 3, 4, 5, 6), 1, ///
               cond(inlist(ed, 7, 8, 9, 10, 11, 12), 2, ///
               cond(inlist(ed, 13, 14, 15, 16, 17, 18), 3, .)))
label define edlevel 1 "None or Primary Ed." 2 "Secondary Ed." 3 "University+ Ed."
label values ed_level edlevel
label var ed_level "Education Level"

* years of education: label it and turn the sentinel codes into missing
label var ed "Years of Education"
mvdecode ed, mv(888888 988888 999999)

** matching
* Flag interviews that were canceled but are not labeled as fraudulent (1 = yes, 0 = no);
* these are left out of every matching run below
gen byte canceled_nonfraud = (status == "Canceled" & likelyfraud == 0)

* Fix the random-number seed before the matching runs
set seed 38263563

* Three matching runs with likelyfraud as the treatment, differing only in how age enters.
* NOTE(review): presumably each cem call replaces the cem_* variables left by the
* previous one, so the cem_matched used later comes from the last (agequota) run -
* confirm against the cem documentation. The earlier runs are kept in place and in
* order so the sequence of commands following set seed is unchanged.

* (1) age entered as birth_yr
cem upm1 (#0) q1 (#0) birth_yr (#3) if !canceled_nonfraud, tr(likelyfraud) k2k

* (2) age entered as the edad cohort with explicit cutpoints
cem upm1 (#0) q1 (#0) edad (0 2.5 4.5 7) if !canceled_nonfraud, tr(likelyfraud) k2k

* (3) age entered as agequota
cem upm1 (#0) q1 (#0) agequota (#0) if !canceled_nonfraud, tr(likelyfraud) k2k

** label subsets of data
* clean_data: 1 for every interview whose status is not "Canceled", 0 for canceled ones
gen byte clean_data = (status != "Canceled")

* compromised_data, evaluated in priority order:
*   "Initially Approved" and matched      -> 0
*   "Initially Approved" and unmatched    -> 1
*   likely fraud and matched              -> 1
*   anything else                         -> missing
gen byte compromised_data = ///
    cond(status == "Initially Approved" & cem_matched == 1, 0, ///
    cond(status == "Initially Approved" & cem_matched == 0, 1, ///
    cond(likelyfraud == 1 & cem_matched == 1, 1, .)))

* percentmatch from Kuriakose and Robbins 2015
* Runs the user-written percentmatch command over the substantive survey items listed below.
* Options as written: gen(pmatch) names the new result variable, id(sbjnum) names the
* interview identifier, and matchedid(m_id) names the new variable holding a matched id.
* NOTE(review): presumably pmatch is each interview's highest share of identical answers
* with any other interview and m_id is that other interview's sbjnum - confirm in
* "help percentmatch".
* The command has no if-condition, so it is run over every observation in memory.
* The following items are left out of the varlist:
* excludes mil10a1 mil10e1 mil10oas1 mil10un1 mil10oas2 mil10un2 mil10a2 mil10e2
* excludes dst1b1 drk11 env1c1 env2b1 env1c2 env2b2 dst1b2 drk12
percentmatch ls3 a4 soct2 idio2 np1 cp6 cp7 cp8 cp13 cp20 it1 l1 venl2 venl3 prot3 venprot10 venprot12 venprot11 ///
jc10 jc13 jc15a vic1ext vic1exta aoj11 aoj12 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 ///
venvb18 venvb19 venvb20 pr3dn pr3en polz1 polz1a m1 m2 sd2new2 sd3new2 sd6new2 infrax infra3 ros1 ros4 ing4 eff1 eff2 ///
vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b ///
pn4 pn5 w14a e5 e15 e3 e16 d1 d2 d3 d4 d5 d6 lib1 lib2b lib2c lib4 dem11 aut1 exc2 exc6 exc20 exc11 exc13 exc14 exc15 exc16 exc18 ///
exc7new vicbar7 vicbar7f fear11 capital1 iga1 igaaoj22 inf1 vb1 vb2 vb3n vb6 venvb7 vb10 vb11 pol1 fex2 vb20 ///
venvb10 venvb21 venvb22 vengrp1 vengrp2 vengrp3 vengrp4 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 ///
vengrp13 vengrp14 for5n venesc2b venesc3 venps1 venps2 venps3 venct1 venct2 venct3 ///
wf1 cct1b ed ed2 q5a q5b q3c ocup4a ocup1a q10g q10new q10a q14 q10d q10e q11n q12c q12bn q12 q12m q12f etid www1 ///
gi0 pr1 r3 r4 r4a r5 r6 r7 r8 r12 r14 r15 r18 r1 r16, gen(pmatch) id(sbjnum) matchedid(m_id)

** Figure A4 **
* Two overlaid discrete histograms of pmatch:
*   plot 1 (solid green)    - all clean interviews (clean_data == 1)
*   plot 2 (black outline)  - matched interviews flagged as likely fraud
twoway (histogram pmatch if clean_data == 1, color(green) discrete) ///
       (histogram pmatch if likelyfraud == 1 & cem_matched == 1, lcolor(black) fcolor(none) discrete), ///
       xtitle("percentmatch") graphregion(color(white)) ///
       legend(order(1 "Full Clean" 2 "Fraudulent"))